{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 工具包导入&数据读取\n",
    "## 工具包导入"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "import lightgbm as lgb\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from scipy.special import logit"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 数据读取 "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Directory holding the competition CSVs. Set the SANTANDER_DATA_DIR environment\n",
    "# variable to point elsewhere; the default is the original hard-coded location.\n",
    "path = os.environ.get('SANTANDER_DATA_DIR',\n",
    "                      '/data/Data_JieZhang/santander-customer-transaction-prediction/')\n",
    "\n",
    "# os.path.join works whether or not the directory ends with a slash.\n",
    "train_df = pd.read_csv(os.path.join(path, \"train.csv\"))\n",
    "test_df = pd.read_csv(os.path.join(path, \"test.csv\"))\n",
    "\n",
    "# Feature columns are the ones whose name starts with \"var\".\n",
    "features = [col for col in train_df.columns if col.startswith(\"var\")]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 删除Fake数据"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# For each test row and feature, flag whether the value occurs more than once\n",
    "# across train + test combined.\n",
    "# pd.concat replaces Series.append, which was removed in pandas 2.0; building the\n",
    "# frame in one step also avoids inserting the columns one at a time.\n",
    "hist_df = pd.DataFrame({\n",
    "    var: test_df[var].map(pd.concat([train_df[var], test_df[var]]).value_counts()) > 1\n",
    "    for var in features\n",
    "})\n",
    "\n",
    "# True for the test rows that are kept as real: at least one of their values is\n",
    "# not repeated. Rows in which every feature is flagged are treated as fake.\n",
    "ind = hist_df.sum(axis=1) != len(features)\n",
    "\n",
    "# Per-feature value counts over train plus the retained (real) test rows only.\n",
    "var_stats = {\n",
    "    var: pd.concat([train_df[var], test_df.loc[ind, var]]).value_counts()\n",
    "    for var in features\n",
    "}"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 模型训练&预测\n",
    "\n",
    "在之前Chris Deotte开源过一篇文章: Modified Naive Bayes - Santander - [0.899]。在他的kernel中,我们发现各个特征之间近似相互独立,因此下面对每个特征单独训练一个LightGBM模型(输入为特征值及其出现次数),再把各模型预测概率的logit相加作为最终预测。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "var_0\n",
      "var_1\n",
      "var_2\n",
      "var_3\n",
      "var_4\n",
      "var_5\n",
      "var_6\n",
      "var_7\n",
      "var_8\n",
      "var_9\n",
      "var_10\n",
      "var_11\n",
      "var_12\n",
      "var_13\n",
      "var_14\n",
      "var_15\n",
      "var_16\n",
      "var_17\n",
      "var_18\n",
      "var_19\n",
      "var_20\n",
      "var_21\n",
      "var_22\n",
      "var_23\n",
      "var_24\n",
      "var_25\n",
      "var_26\n",
      "var_27\n",
      "var_28\n",
      "var_29\n",
      "var_30\n",
      "var_31\n",
      "var_32\n",
      "var_33\n",
      "var_34\n",
      "var_35\n",
      "var_36\n",
      "var_37\n",
      "var_38\n",
      "var_39\n",
      "var_40\n",
      "var_41\n",
      "var_42\n",
      "var_43\n",
      "var_44\n",
      "var_45\n",
      "var_46\n",
      "var_47\n",
      "var_48\n",
      "var_49\n",
      "var_50\n",
      "var_51\n",
      "var_52\n",
      "var_53\n",
      "var_54\n",
      "var_55\n",
      "var_56\n",
      "var_57\n",
      "var_58\n",
      "var_59\n",
      "var_60\n",
      "var_61\n",
      "var_62\n",
      "var_63\n",
      "var_64\n",
      "var_65\n",
      "var_66\n",
      "var_67\n",
      "var_68\n",
      "var_69\n",
      "var_70\n",
      "var_71\n",
      "var_72\n",
      "var_73\n",
      "var_74\n",
      "var_75\n",
      "var_76\n",
      "var_77\n",
      "var_78\n",
      "var_79\n",
      "var_80\n",
      "var_81\n",
      "var_82\n",
      "var_83\n",
      "var_84\n",
      "var_85\n",
      "var_86\n",
      "var_87\n",
      "var_88\n",
      "var_89\n",
      "var_90\n",
      "var_91\n",
      "var_92\n",
      "var_93\n",
      "var_94\n",
      "var_95\n",
      "var_96\n",
      "var_97\n",
      "var_98\n",
      "var_99\n",
      "var_100\n",
      "var_101\n",
      "var_102\n",
      "var_103\n",
      "var_104\n",
      "var_105\n",
      "var_106\n",
      "var_107\n",
      "var_108\n",
      "var_109\n",
      "var_110\n",
      "var_111\n",
      "var_112\n",
      "var_113\n",
      "var_114\n",
      "var_115\n",
      "var_116\n",
      "var_117\n",
      "var_118\n",
      "var_119\n",
      "var_120\n",
      "var_121\n",
      "var_122\n",
      "var_123\n",
      "var_124\n",
      "var_125\n",
      "var_126\n",
      "var_127\n",
      "var_128\n",
      "var_129\n",
      "var_130\n",
      "var_131\n",
      "var_132\n",
      "var_133\n",
      "var_134\n",
      "var_135\n",
      "var_136\n",
      "var_137\n",
      "var_138\n",
      "var_139\n",
      "var_140\n",
      "var_141\n",
      "var_142\n",
      "var_143\n",
      "var_144\n",
      "var_145\n",
      "var_146\n",
      "var_147\n",
      "var_148\n",
      "var_149\n",
      "var_150\n",
      "var_151\n",
      "var_152\n",
      "var_153\n",
      "var_154\n",
      "var_155\n",
      "var_156\n",
      "var_157\n",
      "var_158\n",
      "var_159\n",
      "var_160\n",
      "var_161\n",
      "var_162\n",
      "var_163\n",
      "var_164\n",
      "var_165\n",
      "var_166\n",
      "var_167\n",
      "var_168\n",
      "var_169\n",
      "var_170\n",
      "var_171\n",
      "var_172\n",
      "var_173\n",
      "var_174\n",
      "var_175\n",
      "var_176\n",
      "var_177\n",
      "var_178\n",
      "var_179\n",
      "var_180\n",
      "var_181\n",
      "var_182\n",
      "var_183\n",
      "var_184\n",
      "var_185\n",
      "var_186\n",
      "var_187\n",
      "var_188\n",
      "var_189\n",
      "var_190\n",
      "var_191\n",
      "var_192\n",
      "var_193\n",
      "var_194\n",
      "var_195\n",
      "var_196\n",
      "var_197\n",
      "var_198\n",
      "var_199\n"
     ]
    }
   ],
   "source": [
    "# Settings shared by every per-feature classifier.\n",
    "lgb_params = {\n",
    "    'learning_rate': 0.05, 'max_bin': 165, 'max_depth': 5, 'min_child_samples': 150,\n",
    "    'min_child_weight': 0.1, 'min_split_gain': 0.0018, 'n_estimators': 41,\n",
    "    'num_leaves': 6, 'reg_alpha': 2.0, 'reg_lambda': 2.54, 'objective': 'binary', 'n_jobs': -1,\n",
    "}\n",
    "\n",
    "\n",
    "def value_with_count(frame, column):\n",
    "    \"\"\"Return an (n_rows, 2) array: the column's values next to their counts from var_stats.\"\"\"\n",
    "    values = frame[column]\n",
    "    counts = values.map(var_stats[column])\n",
    "    return np.hstack([values.values.reshape(-1, 1), counts.values.reshape(-1, 1)])\n",
    "\n",
    "\n",
    "# Fit one small classifier per feature on (value, count) and accumulate the logit\n",
    "# of column 1 of predict_proba over all features.\n",
    "pred = 0\n",
    "for var in features:\n",
    "    print(var)\n",
    "    model = lgb.LGBMClassifier(**lgb_params)\n",
    "    model = model.fit(value_with_count(train_df, var), train_df[\"target\"].values)\n",
    "    pred += logit(model.predict_proba(value_with_count(test_df, var))[:, 1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Write one score per test ID. The target column holds the summed logits in pred,\n",
    "# not probabilities.\n",
    "submission = pd.DataFrame({\"ID_code\": test_df[\"ID_code\"], \"target\": pred})\n",
    "submission.to_csv(\"zoo.csv\", index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.4"
  },
  "toc": {
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": true,
   "toc_window_display": true
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
